# Computations
import numpy as np
import pandas as pd
from scipy.stats import norm
# preprocessing
from sklearn.impute import SimpleImputer
import re
## progress bar
import progressbar
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and newer releases no longer ship the old names, so use whichever spelling
# this installation provides.
plt.style.use('seaborn-v0_8-whitegrid'
              if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')
# Font sizes and text colour shared by every Matplotlib figure below.
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
## WordCloud
from wordcloud import WordCloud
import warnings
# Suppress every warning from here on (this also hides deprecation notices).
warnings.filterwarnings("ignore")
In this article, we analyze and predict customer churn for Telco Customer Churn data.
| Columns | Description |
|---|---|
| customerID | Customer ID |
| gender | Whether the customer is a male or a female |
| SeniorCitizen | Whether the customer is a senior citizen or not (1, 0) |
| Partner | Whether the customer has a partner or not (Yes, No) |
| Dependents | Whether the customer has dependents or not (Yes, No) |
| tenure | Number of months the customer has stayed with the company |
| PhoneService | Whether the customer has a phone service or not (Yes, No) |
| MultipleLines | Whether the customer has multiple lines or not (Yes, No, No phone service) |
| InternetService | Customer’s internet service provider (DSL, Fiber optic, No) |
| OnlineSecurity | Whether the customer has online security or not (Yes, No, No internet service) |
| OnlineBackup | Whether the customer has an online backup or not (Yes, No, No internet service) |
| DeviceProtection | Whether the customer has device protection or not (Yes, No, No internet service) |
| TechSupport | Whether the customer has tech support or not (Yes, No, No internet service) |
| StreamingTV | Whether the customer has streaming TV or not (Yes, No, No internet service) |
| StreamingMovies | Whether the customer has streaming movies or not (Yes, No, No internet service) |
| Contract | The contract term of the customer (Month-to-month, One year, Two year) |
| PaperlessBilling | Whether the customer has paperless billing or not (Yes, No) |
| PaymentMethod | The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic)) |
| MonthlyCharges | The amount charged to the customer monthly |
| TotalCharges | The total amount charged to the customer |
| Churn | Whether the customer churned or not (Yes or No) |
# Load the raw Telco customer-churn table from the local CSV file.
Data = pd.read_csv('telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')
def Data_info(Inp, Only_NaN = False):
    """Summarise the dtype and missing-value count of every column.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Table to describe.
    Only_NaN : bool, default False
        When true, keep only the columns that contain at least one NaN.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name (index name 'Features') with the columns
        'Data Type' (dtype as a string), 'Number of NaN Values',
        'Size' (row count of ``Inp``) and 'Percentage' (100 minus the
        percentage of NaN values, the latter rounded to two decimals).
    """
    n_rows = Inp.shape[0]
    dtype_frame = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    nan_frame = Inp.isnull().sum().to_frame(name='Number of NaN Values')

    Out = dtype_frame.join(nan_frame, how='outer')
    Out['Size'] = n_rows
    Out['Percentage'] = 100 - np.round(100 * (Out['Number of NaN Values'] / n_rows), 2)
    Out.index.name = 'Features'
    Out['Data Type'] = Out['Data Type'].astype(str)

    if not Only_NaN:
        return Out
    return Out.loc[Out['Number of NaN Values'] > 0]
def text_sep(txt):
    """Insert a space before a capital letter that directly follows a word character.

    Matches do not overlap, so a run of capitals is split only after its
    first letter: 'customerID' becomes 'customer ID'.
    """
    boundary = re.compile(r"(\w)([A-Z])")
    return boundary.sub(r"\1 \2", txt)
def dtypes_group(Inp):
    """Group the column names of a DataFrame by dtype.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Table whose columns are grouped.

    Returns
    -------
    pandas.DataFrame
        One row per distinct dtype (the dtype itself is the index label) and a
        single column 'Columns' holding the list of column names of that dtype.
    """
    Temp = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    kinds = Temp['Data Type'].unique()
    # One list of column names per dtype, in the same order as `kinds`.
    members = [Temp.index[Temp['Data Type'] == kind].tolist() for kind in kinds]
    # dtype=object keeps every list as a single cell value.
    return pd.DataFrame({'Columns': pd.Series(members, index=kinds, dtype=object)})
def col_details(Col):
    """Print a highlighted title for ``Col`` followed by its distinct values.

    Reads the column from the module-level ``Data`` frame. The title is the
    column name passed through ``text_sep``; the values are printed as one
    comma-separated line.

    Parameters
    ----------
    Col : str
        Name of a column of ``Data``.
    """
    print(Back.BLACK + Fore.CYAN + Style.NORMAL + '%s:' % text_sep(Col))
    print(Style.RESET_ALL)
    # str() each value so numeric or missing entries cannot break the join.
    print('%s' % ', '.join(map(str, Data[Col].unique())))
# Bar chart of the 'Percentage' column of Data_info for every feature,
# coloured and labelled by data type.
data_info = Data_info(Data).reset_index(drop = False)
fig = px.bar(
    data_info,
    x = 'Features',
    y = 'Percentage',
    color = 'Data Type',
    text = 'Data Type',
    color_discrete_sequence = ['PaleGreen', 'LightBlue', 'PeachPuff'],
    hover_data = data_info.columns,
)
fig.update_layout(
    plot_bgcolor = 'white',
    legend = dict(x=1, y=.5, traceorder="normal", bordercolor="DarkGray", borderwidth=1),
    width = 980,
)
# Label placement and bar outline applied to every trace in one call.
fig.update_traces(
    texttemplate = 6*' ' + '%{label}',
    textposition = 'inside',
    marker_line_color = 'Black',
    marker_line_width = 1.,
    opacity = 1,
)
fig.show()
# Capitalise the two lower-case column names, then split CamelCase names into words.
Data.rename(columns = {'gender':'Gender', 'tenure':'Tenure'}, inplace = True)
Data.columns = list(map(text_sep, Data.columns.tolist()))

# Overview of the columns per dtype.
Data_types = dtypes_group(Data)
display(Data_types)

# Cast the int64 columns with the built-in int type.
int_columns = Data_types.loc[Data_types.index == 'int64', 'Columns'].iloc[0]
Data[int_columns] = Data[int_columns].astype(int)
del int_columns
| Columns | |
|---|---|
| int64 | [Senior Citizen, Tenure] |
| float64 | [Monthly Charges] |
| object | [customer ID, Payment Method, Paperless Billin... |
# Cast the float64 columns with the built-in float type.
float_columns = Data_types.loc[Data_types.index == 'float64', 'Columns'].iloc[0]
Data[float_columns] = Data[float_columns].astype(float)
del float_columns

# 'Total Charges' is parsed as numbers; entries that cannot be parsed become NaN.
Data['Total Charges'] = pd.to_numeric(Data['Total Charges'], errors='coerce')
First, let's encode all Yes/No columns as follows:
\begin{cases} 0 &\mbox{No}\\ 1 &\mbox{Yes}\end{cases}Temp = []
for i in Data_types.loc[Data_types.index == 'object'].values[0,0]:
if set(Data[i].unique().tolist()) == {'No', 'Yes'}:
Temp.append(i)
Data[Temp] = Data[Temp].replace({'Yes':1, 'No':0}).astype(int)
del Temp
Some other columns can be converted similarly; however, we first need to create a new feature.
# Object columns whose values are exactly 'Yes', 'No' and 'No internet service'.
Temp = [
    name
    for name in Data_types.loc[Data_types.index == 'object'].values[0,0]
    if set(Data[name].unique().tolist()) == {'No', 'No internet service', 'Yes'}
]
print(
    Back.BLACK + Fore.CYAN + Style.NORMAL + 'Columns:' + Style.RESET_ALL
    + ' %s' % ', '.join(Temp)
)
Columns: Streaming Movies, Streaming TV, Tech Support, Device Protection, Online Backup, Online Security
Note that,
# Print the distinct values of the 'Internet Service' column.
col_details('Internet Service')
Internet Service:
DSL, Fiber optic, No
This column can be encoded as follows:
$$\mbox{InternetServiceType} = \begin{cases} 0 &\mbox{No} \\ 1 &\mbox{DSL}\\ 2 &\mbox{Fiber optic}\end{cases}$$def myfun(x):
if x == 'No':
return 0
elif x == 'DSL':
return 1
else:
return 2
Data['Internet Service'] = Data['Internet Service'].apply(lambda x: myfun(x)).astype(int)
del myfun
Since we have already included "No internet service" in InternetService, we can code the rest as:
\begin{cases} 0 &\mbox{No, No internet service}\\ 1 &\mbox{Yes}\end{cases}Data[Temp] = Data[Temp].applymap(lambda x: 1 if x =='Yes' else 0).astype(int)
Since PhoneService is already a feature, MultipleLines can be coded as $$ \mbox{MultipleLines} = \begin{cases} 0 &\mbox{No, No phone service}\\ 1 &\mbox{Yes}\end{cases} $$
# 'Yes' becomes 1; every other value becomes 0.
Data['Multiple Lines'] = (Data['Multiple Lines'] == 'Yes').astype(int)

# Recompute the dtype groups and list the columns that are still of object dtype.
Data_types = dtypes_group(Data)
Temp = Data_types.loc[Data_types.index == 'object', 'Columns'].iloc[0]
print('Columns: %s' % ', '.join(Temp))
Columns: Contract, Gender, Payment Method, customer ID
# Print the distinct values of the 'Contract' column.
col_details('Contract')
Contract:
Month-to-month, One year, Two year
# Contract terms are ordered, so they are coded 0 / 1 / 2.
Data['Contract'] = (
    Data['Contract']
    .replace(to_replace={'Month-to-month':0, 'One year':1, 'Two year':2})
    .astype(int)
)
# 'Male' becomes 1; every other value becomes 0.
Data['Gender'] = (Data['Gender'] == 'Male').astype(int)
# Print the distinct values of the 'Payment Method' column.
col_details('Payment Method')
Payment Method:
Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic)
In this case, we cannot rank these values. Therefore, we one-hot encode them:
# Replace 'Payment Method' with one 0/1 indicator column per payment method,
# appended after the existing columns. The right-hand side is evaluated on the
# frame that still contains 'Payment Method', before `Data` is rebound.
Data = Data.drop(columns = ['Payment Method']).join(
    pd.get_dummies(Data['Payment Method']).astype(int)
)

Data_types = dtypes_group(Data)
display(Data_types)
| Columns | |
|---|---|
| int32 | [Device Protection, Credit card (automatic), B... |
| float64 | [Monthly Charges, Total Charges] |
| object | [customer ID] |
# Names of the columns that still contain NaN values.
Temp = Data_info(Data, Only_NaN = True).index.tolist()
# Replace the NaN entries of those columns with the column mean. The guard
# skips the step when nothing is missing, because the imputer cannot be fitted
# on an empty column selection.
if Temp:
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    Data[Temp] = imp.fit_transform(Data[Temp])
Let's take a look at the variance of the features.
# Variance of every numeric feature (target 'Churn' excluded), largest first.
# numeric_only=True keeps the object column 'customer ID' out of the
# computation, and Styler.format replaces Styler.set_precision, which recent
# pandas releases no longer provide.
display(
    Data.drop(columns = ['Churn'])
    .var(numeric_only = True)
    .sort_values(ascending = False)
    .to_frame(name = 'Variance')
    .style.background_gradient(cmap = sns.light_palette("green", as_cmap=True))
    .format('{:.2f}')
)
| Variance | |
|---|---|
| Total Charges | 5130226.17 |
| Monthly Charges | 905.41 |
| Tenure | 603.17 |
| Contract | 0.70 |
| Internet Service | 0.61 |
| Gender | 0.25 |
| Partner | 0.25 |
| Multiple Lines | 0.24 |
| Paperless Billing | 0.24 |
| Streaming Movies | 0.24 |
| Streaming TV | 0.24 |
| Online Backup | 0.23 |
| Device Protection | 0.23 |
| Electronic check | 0.22 |
| Dependents | 0.21 |
| Tech Support | 0.21 |
| Online Security | 0.20 |
| Mailed check | 0.18 |
| Bank transfer (automatic) | 0.17 |
| Credit card (automatic) | 0.17 |
| Senior Citizen | 0.14 |
| Phone Service | 0.09 |
def Correlation_Plot (Df,Fig_Size):
    """Draw an annotated lower-triangle heatmap of the pairwise correlations.

    Parameters
    ----------
    Df : pandas.DataFrame
        Table whose numeric and boolean columns are correlated; other columns
        are left out so that identifier or text columns cannot make ``corr``
        fail.
    Fig_Size : float
        Width and height of the square figure, in inches.
    """
    Correlation_Matrix = Df.select_dtypes(include=['number', 'bool']).corr().round(2)
    # Hide the strict upper triangle; the diagonal stays visible.
    mask = np.triu(np.ones(Correlation_Matrix.shape, dtype=bool), k=1)
    Fig, ax = plt.subplots(figsize=(Fig_Size,Fig_Size))
    # NOTE: the colour scale spans 0..1, so negative correlations share the
    # lightest colour; their values are still readable from the annotations.
    sns.heatmap(Correlation_Matrix, ax=ax, mask=mask, annot=True, square=True,
                cmap =sns.color_palette("Greens", n_colors=10), linewidths = 0.2,
                vmin=0, vmax=1, cbar_kws={"shrink": .6})


Correlation_Plot (Data, 16)
Correlations of features with customer Churn.
# Correlation of every numeric feature with 'Churn', in ascending order.
# Non-numeric columns are excluded up front so that corr() cannot fail on them.
Temp = Data.select_dtypes(include=['number', 'bool']).corr().round(2)
# [:-1] drops the last row after sorting, which is 'Churn' itself (correlation 1).
# Styler.format replaces Styler.set_precision, which recent pandas releases no
# longer provide.
Temp['Churn'].sort_values().to_frame(name= 'Correlation')[:-1].style.background_gradient(
    cmap='RdYlGn', subset=['Correlation']).format('{:.2f}')
| Correlation | |
|---|---|
| Contract | -0.40 |
| Tenure | -0.35 |
| Total Charges | -0.20 |
| Online Security | -0.17 |
| Tech Support | -0.16 |
| Dependents | -0.16 |
| Partner | -0.15 |
| Credit card (automatic) | -0.13 |
| Bank transfer (automatic) | -0.12 |
| Mailed check | -0.09 |
| Online Backup | -0.08 |
| Device Protection | -0.07 |
| Gender | -0.01 |
| Phone Service | 0.01 |
| Multiple Lines | 0.04 |
| Streaming TV | 0.06 |
| Streaming Movies | 0.06 |
| Senior Citizen | 0.15 |
| Paperless Billing | 0.19 |
| Monthly Charges | 0.19 |
| Electronic check | 0.30 |
| Internet Service | 0.32 |
# Histogram with a fitted normal curve for each of the three continuous features,
# drawn side by side.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize = (16, 6))
Temp = ['Tenure','Monthly Charges','Total Charges']
for panel, feature in zip(ax, Temp):
    _ = sns.distplot(Data[feature], fit=norm, kde=False, color='seagreen', ax=panel)
# Write the encoded and imputed table to a new CSV file, without the index column.
Data.to_csv('telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn_clean.csv', index=False)